{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Style-Bert-VITS2ライブラリの使用例\n",
    "\n",
    "`pip install style-bert-vits2`を使った、jupyter notebookでの使用例です。Google colab等でも動きます。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyTorch環境の構築（ない場合）\n",
    "# 参照: https://pytorch.org/get-started/locally/\n",
    "\n",
    "!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "LLrngKcQEAyP"
   },
   "outputs": [],
   "source": [
    "# style-bert-vits2のインストール\n",
    "\n",
    "!pip install style-bert-vits2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9xRtfUg5EZkx"
   },
   "outputs": [],
   "source": [
    "# BERTモデルをロード（ローカルに手動でダウンロードする必要はありません）\n",
    "\n",
    "from style_bert_vits2.nlp import bert_models\n",
    "from style_bert_vits2.constants import Languages\n",
    "\n",
    "\n",
    "bert_models.load_model(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
    "bert_models.load_tokenizer(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
    "# bert_models.load_model(Languages.EN, \"microsoft/deberta-v3-large\")\n",
    "# bert_models.load_tokenizer(Languages.EN, \"microsoft/deberta-v3-large\")\n",
    "# bert_models.load_model(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")\n",
    "# bert_models.load_tokenizer(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "q2V9d3HyFAr_"
   },
   "outputs": [],
   "source": [
    "# Hugging Faceから試しにデフォルトモデルをダウンロードしてみて、それを音声合成に使ってみる\n",
    "# model_assetsディレクトリにダウンロードされます\n",
    "\n",
    "from pathlib import Path\n",
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "\n",
    "model_file = \"jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors\"\n",
    "config_file = \"jvnv-F1-jp/config.json\"\n",
    "style_file = \"jvnv-F1-jp/style_vectors.npy\"\n",
    "\n",
    "for file in [model_file, config_file, style_file]:\n",
    "    print(file)\n",
    "    hf_hub_download(\"litagin/style_bert_vits2_jvnv\", file, local_dir=\"model_assets\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hJa31MEUFhe4"
   },
   "outputs": [],
   "source": [
    "# 上でダウンロードしたモデルファイルを指定して音声合成のテスト\n",
    "\n",
    "from style_bert_vits2.tts_model import TTSModel\n",
    "\n",
    "assets_root = Path(\"model_assets\")\n",
    "\n",
    "model = TTSModel(\n",
    "    model_path=assets_root / model_file,\n",
    "    config_path=assets_root / config_file,\n",
    "    style_vec_path=assets_root / style_file,\n",
    "    device=\"cpu\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Gal0tqrtGXZx"
   },
   "outputs": [],
   "source": [
    "from IPython.display import Audio, display\n",
    "\n",
    "sr, audio = model.infer(text=\"こんにちは\")\n",
    "display(Audio(audio, rate=sr))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
